INT104 code

Artificial intelligence (R)

Author

TC-tea

Published

2023.05.17

🚀

👉🏻Click to enter the INT104 coursework section

1 Cross validation accuracy

Question: what is the 2-fold cross-validation accuracy of a 3-nearest-neighbors classifier on the dataset below?

# Original dataframe: a 2 x 10 numeric matrix. Row "X" holds the feature
# values 0..9 and row "Y" holds the alternating class labels 1, 0, 1, 0, ...
dtf <- rbind(
  X = 0:9,
  Y = rep(c(1, 0), times = 5)
)
dtf

  [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
X    0    1    2    3    4    5    6    7    8     9
Y    1    0    1    0    1    0    1    0    1     0

# Split the data into two folds (row 1 holds X, row 2 holds the label Y)
fold1 <- dtf[, 1:5]
fold2 <- dtf[, 6:10]

# Count how many validation points a k-nearest-neighbours majority vote
# classifies correctly.
#   valid, train: 2-row matrices laid out like `dtf` (row 1 = X, row 2 = Y)
#   k:            number of neighbours that vote (default 3)
# Returns the number of columns of `valid` whose label is predicted correctly.
count_knn_hits <- function(valid, train, k = 3) {
  # Never ask for more neighbours than there are training points
  k <- min(k, ncol(train))
  hits <- 0
  for (j in seq_len(ncol(valid))) {
    gap <- abs(valid[1, j] - train[1, ])
    # order() leaves ties in their original order, so equidistant
    # neighbours are taken in column order
    nearest <- order(gap)[seq_len(k)]
    votes <- train[2, nearest]
    # Predict 0 only on a strict majority of zeros; otherwise predict 1
    guess <- if (sum(votes == 0) > sum(votes == 1)) 0 else 1
    if (guess == valid[2, j]) {
      hits <- hits + 1
    }
  }
  hits
}

# Perform 2-fold cross validation: each fold is validated once against the
# other fold used as the training set
accuracy <- c(
  count_knn_hits(fold1, fold2),
  count_knn_hits(fold2, fold1)
)

# Compute the cross validation accuracy (mean of the per-fold hit rates, in %)
cv_accuracy <- mean(accuracy / c(ncol(fold1), ncol(fold2))) * 100
# "%.0f" instead of "%d": cv_accuracy is a double, and "%d" raises an error
# whenever the value is not exactly whole (e.g. 30.000000000000004)
sprintf("Cross validation accuracy: %.0f%%", cv_accuracy)

[1] "Cross validation accuracy: 40%"

2 Cluster centers

# Cluster C1: two points, (0, 6) and (6, 0); the centre is the per-coordinate
# mean of the member points
C1 <- data.frame(x = c(0, 6), y = c(6, 0))
C1_center <- as.data.frame(lapply(C1, mean))
print(
  paste0("Center of cluster C1: (", C1_center[["x"]], ", ", C1_center[["y"]], ")")
)

[1] "Center of cluster C1: (3, 3)"

# Cluster C2: three points on the diagonal, (2, 2), (4, 4) and (6, 6); the
# centre is the per-coordinate mean of the member points
C2 <- data.frame(x = c(2, 4, 6), y = c(2, 4, 6))
C2_center <- as.data.frame(lapply(C2, mean))
print(
  paste0("Center of cluster C2: (", C2_center[["x"]], ", ", C2_center[["y"]], ")")
)

[1] "Center of cluster C2: (4, 4)"

# Cluster C3: two points, (5, 5) and (7, 7); the centre is the per-coordinate
# mean of the member points
C3 <- data.frame(x = c(5, 7), y = c(5, 7))
C3_center <- as.data.frame(lapply(C3, mean))
print(
  paste0("Center of cluster C3: (", C3_center[["x"]], ", ", C3_center[["y"]], ")")
)

[1] "Center of cluster C3: (6, 6)"

3 Naïve Bayes classifier

# Training dataset: one row per training example, four binary features
# X1..X4 followed by the binary class label l in the last column
dtf1 <- rbind(
  c(0, 1, 0, 1, 0),
  c(0, 0, 1, 0, 0),
  c(1, 1, 1, 0, 0),
  c(1, 0, 1, 0, 1)
)
dimnames(dtf1) <- list(NULL, c("X1", "X2", "X3", "X4", "l"))
dtf1

     X1 X2 X3 X4 l
[1,]  0  1  0  1 0
[2,]  0  0  1  0 0
[3,]  1  1  1  0 0
[4,]  1  0  1  0 1

# Predicting dataset: a single example with known features X1..X4 and a
# label l that is still missing (NA) and will be filled in later
dtf2 <- matrix(
  c(1, 0, 1, 1, NA),
  nrow = 1,
  dimnames = list(NULL, c("X1", "X2", "X3", "X4", "l"))
)
dtf2

     X1 X2 X3 X4  l
[1,]  1  0  1  1 NA

# Helpers shared by all tables below. They read the training matrix `dtf1`,
# whose column 5 is the class label l.
#   p_marginal(k, v):         fraction of training rows with column k equal to v
#   p_given_label(k, v, cls): among rows labelled `cls`, fraction whose
#                             column k equals v
#   p_bayes(lik, prior, ev):  Bayes' rule, (lik * prior) / ev
p_marginal <- function(k, v) {
  sum(dtf1[, k] == v) / nrow(dtf1)
}
p_given_label <- function(k, v, cls) {
  sum(dtf1[, k] == v & dtf1[, 5] == cls) / sum(dtf1[, 5] == cls)
}
p_bayes <- function(lik, prior, ev) {
  (lik * prior) / ev
}

# Label l: class priors P(l = 0) and P(l = 1)
pl0 <- p_marginal(5, 0)
pl1 <- p_marginal(5, 1)

# Naming scheme used for every feature Xk below:
#   p_xkv     = P(Xk = v)            marginal
#   p_xkv_lc  = P(Xk = v | l = c)    class-conditional likelihood
#   p_lc_xkv  = P(l = c | Xk = v)    single-feature posterior

# Feature X1
p_x10 <- p_marginal(1, 0)
p_x11 <- p_marginal(1, 1)
p_x10_l0 <- p_given_label(1, 0, 0)
p_x10_l1 <- p_given_label(1, 0, 1)
p_x11_l0 <- p_given_label(1, 1, 0)
p_x11_l1 <- p_given_label(1, 1, 1)
p_l0_x10 <- p_bayes(p_x10_l0, pl0, p_x10)
p_l1_x10 <- p_bayes(p_x10_l1, pl1, p_x10)
p_l0_x11 <- p_bayes(p_x11_l0, pl0, p_x11)
p_l1_x11 <- p_bayes(p_x11_l1, pl1, p_x11)

# Feature X2
p_x20 <- p_marginal(2, 0)
p_x21 <- p_marginal(2, 1)
p_x20_l0 <- p_given_label(2, 0, 0)
p_x20_l1 <- p_given_label(2, 0, 1)
p_x21_l0 <- p_given_label(2, 1, 0)
p_x21_l1 <- p_given_label(2, 1, 1)
p_l0_x20 <- p_bayes(p_x20_l0, pl0, p_x20)
p_l1_x20 <- p_bayes(p_x20_l1, pl1, p_x20)
p_l0_x21 <- p_bayes(p_x21_l0, pl0, p_x21)
p_l1_x21 <- p_bayes(p_x21_l1, pl1, p_x21)

# Feature X3
p_x30 <- p_marginal(3, 0)
p_x31 <- p_marginal(3, 1)
p_x30_l0 <- p_given_label(3, 0, 0)
p_x30_l1 <- p_given_label(3, 0, 1)
p_x31_l0 <- p_given_label(3, 1, 0)
p_x31_l1 <- p_given_label(3, 1, 1)
p_l0_x30 <- p_bayes(p_x30_l0, pl0, p_x30)
p_l1_x30 <- p_bayes(p_x30_l1, pl1, p_x30)
p_l0_x31 <- p_bayes(p_x31_l0, pl0, p_x31)
p_l1_x31 <- p_bayes(p_x31_l1, pl1, p_x31)

# Feature X4
p_x40 <- p_marginal(4, 0)
p_x41 <- p_marginal(4, 1)
p_x40_l0 <- p_given_label(4, 0, 0)
p_x40_l1 <- p_given_label(4, 0, 1)
p_x41_l0 <- p_given_label(4, 1, 0)
p_x41_l1 <- p_given_label(4, 1, 1)
p_l0_x40 <- p_bayes(p_x40_l0, pl0, p_x40)
p_l1_x40 <- p_bayes(p_x40_l1, pl1, p_x40)
p_l0_x41 <- p_bayes(p_x41_l0, pl0, p_x41)
p_l1_x41 <- p_bayes(p_x41_l1, pl1, p_x41)

# Predicting probability
# Naive Bayes scores a class c as
#   P(l = c) * P(X1 = x1 | l = c) * ... * P(X4 = x4 | l = c),
# i.e. the class prior times the product of the class-conditional likelihoods
# of the observed feature values. Adding up the single-feature posteriors
# P(l = c | Xk = xk) instead does not give a Naive Bayes score.
# The scores are unnormalised (they do not sum to 1), which is fine because
# only their comparison matters.
# NOTE: no smoothing is applied, so a feature value that never occurs together
# with a class in the training data drives that class's score to exactly 0.
p0 <- pl0 * prod(if (dtf2[1, 1] == 0) p_x10_l0 else p_x11_l0,
                 if (dtf2[1, 2] == 0) p_x20_l0 else p_x21_l0,
                 if (dtf2[1, 3] == 0) p_x30_l0 else p_x31_l0,
                 if (dtf2[1, 4] == 0) p_x40_l0 else p_x41_l0)
p1 <- pl1 * prod(if (dtf2[1, 1] == 1) p_x11_l1 else p_x10_l1,
                 if (dtf2[1, 2] == 1) p_x21_l1 else p_x20_l1,
                 if (dtf2[1, 3] == 1) p_x31_l1 else p_x30_l1,
                 if (dtf2[1, 4] == 1) p_x41_l1 else p_x40_l1)

# Result: the class with the larger score wins; equal scores are reported as
# undecided
ifelse(p0 == p1, "Result: 0 or 1", ifelse(p0 > p1, "Result: 0", "Result: 1"))

[1] "Result: 0"

# Store the predicted class in the label column of the query row.
# If the two scores tie, the text "0 or 1" is stored instead, which turns the
# whole matrix into a character matrix.
dtf2[, "l"] <- ifelse(p0 != p1, ifelse(p0 < p1, 1, 0), "0 or 1")
dtf2

     X1 X2 X3 X4 l
[1,]  1  0  1  1 0

4 Agglomerative Clustering

from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
import numpy as np
import matplotlib.pyplot as plt

# Pairwise distance matrix between the five items (symmetric, zero diagonal).
D = np.array([[0, 8, 3, 5, 13],
              [8, 0, 6, 5, 10],
              [3, 6, 0, 8, 2],
              [5, 5, 8, 0, 7],
              [13, 10, 2, 7, 0]])

# linkage() interprets a 2-D array as raw observation vectors (one row per
# point), not as a distance matrix. Precomputed distances therefore have to be
# passed in condensed 1-D form, which squareform() produces from the square
# matrix.
condensed = squareform(D)

# Single linkage: cluster distance = smallest pairwise distance.
single = linkage(condensed, 'single')
dendrogram(single)
plt.title('Single linkage')
plt.show()

# Complete linkage: cluster distance = largest pairwise distance.
complete = linkage(condensed, 'complete')
dendrogram(complete)
plt.title('Complete linkage')
plt.show()

# Average linkage: cluster distance = mean pairwise distance.
average = linkage(condensed, 'average')
dendrogram(average)
plt.title('Average linkage')
plt.show()

SessionInfo:

R version 4.2.3 (2023-03-15 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19044)

Matrix products: default

locale:
[1] LC_COLLATE=Chinese (Simplified)_China.utf8 
[2] LC_CTYPE=Chinese (Simplified)_China.utf8   
[3] LC_MONETARY=Chinese (Simplified)_China.utf8
[4] LC_NUMERIC=C                               
[5] LC_TIME=en_GB.UTF-8                        

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.10       lattice_0.20-45   png_0.1-8         digest_0.6.31    
 [5] grid_4.2.3        jsonlite_1.8.4    evaluate_0.20     rlang_1.1.0      
 [9] cli_3.6.1         rstudioapi_0.14   Matrix_1.5-3      reticulate_1.28  
[13] rmarkdown_2.21    tools_4.2.3       htmlwidgets_1.6.2 xfun_0.37        
[17] yaml_2.3.7        fastmap_1.1.0     compiler_4.2.3    htmltools_0.5.4  
[21] knitr_1.42